#ifndef WOLF_AES_CL
#define WOLF_AES_CL

#ifdef cl_amd_media_ops2
#   pragma OPENCL EXTENSION cl_amd_media_ops2 : enable

#   define xmrig_amd_bfe(src0, src1, src2) amd_bfe(src0, src1, src2)
#else
/* taken from: https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops2.txt
 *     Built-in Function:
 *     uintn amd_bfe (uintn src0, uintn src1, uintn src2)
 *   Description
 *     NOTE: operator >> below represent logical right shift
 *     offset = src1.s0 & 31;
 *     width = src2.s0 & 31;
 *     if width = 0
 *         dst.s0 = 0;
 *     else if (offset + width) < 32
 *         dst.s0 = (src0.s0 << (32 - offset - width)) >> (32 - width);
 *     else
 *         dst.s0 = src0.s0 >> offset;
 *     similar operation applied to other components of the vectors
 */
inline int xmrig_amd_bfe(const uint src0, const uint offset, const uint width)
{
    /* casts are removed because we can implement everything as uint
     * int offset = src1;
     * int width = src2;
     * remove check for edge case, this function is always called with
     * `width==8`
     * @code
     *   if ( width == 0 )
     *      return 0;
     * @endcode
     */
    if ((offset + width) < 32u) {
        return (src0 << (32u - offset - width)) >> (32u - width);
    }

    return src0 >> offset;
}
#endif


// AES table - the other three are generated on the fly

static const __constant uint AES0_C[256] =
{
    0xA56363C6U, 0x847C7CF8U, 0x997777EEU, 0x8D7B7BF6U,
    0x0DF2F2FFU, 0xBD6B6BD6U, 0xB16F6FDEU, 0x54C5C591U,
    0x50303060U, 0x03010102U, 0xA96767CEU, 0x7D2B2B56U,
    0x19FEFEE7U, 0x62D7D7B5U, 0xE6ABAB4DU, 0x9A7676ECU,
    0x45CACA8FU, 0x9D82821FU, 0x40C9C989U, 0x877D7DFAU,
    0x15FAFAEFU, 0xEB5959B2U, 0xC947478EU, 0x0BF0F0FBU,
    0xECADAD41U, 0x67D4D4B3U, 0xFDA2A25FU, 0xEAAFAF45U,
    0xBF9C9C23U, 0xF7A4A453U, 0x967272E4U, 0x5BC0C09BU,
    0xC2B7B775U, 0x1CFDFDE1U, 0xAE93933DU, 0x6A26264CU,
    0x5A36366CU, 0x413F3F7EU, 0x02F7F7F5U, 0x4FCCCC83U,
    0x5C343468U, 0xF4A5A551U, 0x34E5E5D1U, 0x08F1F1F9U,
    0x937171E2U, 0x73D8D8ABU, 0x53313162U, 0x3F15152AU,
    0x0C040408U, 0x52C7C795U, 0x65232346U, 0x5EC3C39DU,
    0x28181830U, 0xA1969637U, 0x0F05050AU, 0xB59A9A2FU,
    0x0907070EU, 0x36121224U, 0x9B80801BU, 0x3DE2E2DFU,
    0x26EBEBCDU, 0x6927274EU, 0xCDB2B27FU, 0x9F7575EAU,
    0x1B090912U, 0x9E83831DU, 0x742C2C58U, 0x2E1A1A34U,
    0x2D1B1B36U, 0xB26E6EDCU, 0xEE5A5AB4U, 0xFBA0A05BU,
    0xF65252A4U, 0x4D3B3B76U, 0x61D6D6B7U, 0xCEB3B37DU,
    0x7B292952U, 0x3EE3E3DDU, 0x712F2F5EU, 0x97848413U,
    0xF55353A6U, 0x68D1D1B9U, 0x00000000U, 0x2CEDEDC1U,
    0x60202040U, 0x1FFCFCE3U, 0xC8B1B179U, 0xED5B5BB6U,
    0xBE6A6AD4U, 0x46CBCB8DU, 0xD9BEBE67U, 0x4B393972U,
    0xDE4A4A94U, 0xD44C4C98U, 0xE85858B0U, 0x4ACFCF85U,
    0x6BD0D0BBU, 0x2AEFEFC5U, 0xE5AAAA4FU, 0x16FBFBEDU,
    0xC5434386U, 0xD74D4D9AU, 0x55333366U, 0x94858511U,
    0xCF45458AU, 0x10F9F9E9U, 0x06020204U, 0x817F7FFEU,
    0xF05050A0U, 0x443C3C78U, 0xBA9F9F25U, 0xE3A8A84BU,
    0xF35151A2U, 0xFEA3A35DU, 0xC0404080U, 0x8A8F8F05U,
    0xAD92923FU, 0xBC9D9D21U, 0x48383870U, 0x04F5F5F1U,
    0xDFBCBC63U, 0xC1B6B677U, 0x75DADAAFU, 0x63212142U,
    0x30101020U, 0x1AFFFFE5U, 0x0EF3F3FDU, 0x6DD2D2BFU,
    0x4CCDCD81U, 0x140C0C18U, 0x35131326U, 0x2FECECC3U,
    0xE15F5FBEU, 0xA2979735U, 0xCC444488U, 0x3917172EU,
    0x57C4C493U, 0xF2A7A755U, 0x827E7EFCU, 0x473D3D7AU,
    0xAC6464C8U, 0xE75D5DBAU, 0x2B191932U, 0x957373E6U,
    0xA06060C0U, 0x98818119U, 0xD14F4F9EU, 0x7FDCDCA3U,
    0x66222244U, 0x7E2A2A54U, 0xAB90903BU, 0x8388880BU,
    0xCA46468CU, 0x29EEEEC7U, 0xD3B8B86BU, 0x3C141428U,
    0x79DEDEA7U, 0xE25E5EBCU, 0x1D0B0B16U, 0x76DBDBADU,
    0x3BE0E0DBU, 0x56323264U, 0x4E3A3A74U, 0x1E0A0A14U,
    0xDB494992U, 0x0A06060CU, 0x6C242448U, 0xE45C5CB8U,
    0x5DC2C29FU, 0x6ED3D3BDU, 0xEFACAC43U, 0xA66262C4U,
    0xA8919139U, 0xA4959531U, 0x37E4E4D3U, 0x8B7979F2U,
    0x32E7E7D5U, 0x43C8C88BU, 0x5937376EU, 0xB76D6DDAU,
    0x8C8D8D01U, 0x64D5D5B1U, 0xD24E4E9CU, 0xE0A9A949U,
    0xB46C6CD8U, 0xFA5656ACU, 0x07F4F4F3U, 0x25EAEACFU,
    0xAF6565CAU, 0x8E7A7AF4U, 0xE9AEAE47U, 0x18080810U,
    0xD5BABA6FU, 0x887878F0U, 0x6F25254AU, 0x722E2E5CU,
    0x241C1C38U, 0xF1A6A657U, 0xC7B4B473U, 0x51C6C697U,
    0x23E8E8CBU, 0x7CDDDDA1U, 0x9C7474E8U, 0x211F1F3EU,
    0xDD4B4B96U, 0xDCBDBD61U, 0x868B8B0DU, 0x858A8A0FU,
    0x907070E0U, 0x423E3E7CU, 0xC4B5B571U, 0xAA6666CCU,
    0xD8484890U, 0x05030306U, 0x01F6F6F7U, 0x120E0E1CU,
    0xA36161C2U, 0x5F35356AU, 0xF95757AEU, 0xD0B9B969U,
    0x91868617U, 0x58C1C199U, 0x271D1D3AU, 0xB99E9E27U,
    0x38E1E1D9U, 0x13F8F8EBU, 0xB398982BU, 0x33111122U,
    0xBB6969D2U, 0x70D9D9A9U, 0x898E8E07U, 0xA7949433U,
    0xB69B9B2DU, 0x221E1E3CU, 0x92878715U, 0x20E9E9C9U,
    0x49CECE87U, 0xFF5555AAU, 0x78282850U, 0x7ADFDFA5U,
    0x8F8C8C03U, 0xF8A1A159U, 0x80898909U, 0x170D0D1AU,
    0xDABFBF65U, 0x31E6E6D7U, 0xC6424284U, 0xB86868D0U,
    0xC3414182U, 0xB0999929U, 0x772D2D5AU, 0x110F0F1EU,
    0xCBB0B07BU, 0xFC5454A8U, 0xD6BBBB6DU, 0x3A16162CU
};

#define BYTE(x, y) (xmrig_amd_bfe((x), (y) << 3U, 8U))

#if (ALGO == ALGO_CN_HEAVY_TUBE)
inline uint4 AES_Round_bittube2(const __local uint *AES0, const __local uint *AES1, uint4 x, uint4 k)
{
    x = ~x;
    k.s0 ^= AES0[BYTE(x.s0, 0)] ^ AES1[BYTE(x.s1, 1)] ^ rotate(AES0[BYTE(x.s2, 2)] ^ AES1[BYTE(x.s3, 3)], 16U);
    x.s0 ^= k.s0;
    k.s1 ^= AES0[BYTE(x.s1, 0)] ^ AES1[BYTE(x.s2, 1)] ^ rotate(AES0[BYTE(x.s3, 2)] ^ AES1[BYTE(x.s0, 3)], 16U);
    x.s1 ^= k.s1;
    k.s2 ^= AES0[BYTE(x.s2, 0)] ^ AES1[BYTE(x.s3, 1)] ^ rotate(AES0[BYTE(x.s0, 2)] ^ AES1[BYTE(x.s1, 3)], 16U);
    x.s2 ^= k.s2;
    k.s3 ^= AES0[BYTE(x.s3, 0)] ^ AES1[BYTE(x.s0, 1)] ^ rotate(AES0[BYTE(x.s1, 2)] ^ AES1[BYTE(x.s2, 3)], 16U);
    return k;
}
#endif

uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, uint4 key)
{
    key.s0 ^= AES0[BYTE(X.s0, 0)] ^ AES1[BYTE(X.s1, 1)] ^ AES2[BYTE(X.s2, 2)] ^ AES3[BYTE(X.s3, 3)];
    key.s1 ^= AES0[BYTE(X.s1, 0)] ^ AES1[BYTE(X.s2, 1)] ^ AES2[BYTE(X.s3, 2)] ^ AES3[BYTE(X.s0, 3)];
    key.s2 ^= AES0[BYTE(X.s2, 0)] ^ AES1[BYTE(X.s3, 1)] ^ AES2[BYTE(X.s0, 2)] ^ AES3[BYTE(X.s1, 3)];
    key.s3 ^= AES0[BYTE(X.s3, 0)] ^ AES1[BYTE(X.s0, 1)] ^ AES2[BYTE(X.s1, 2)] ^ AES3[BYTE(X.s2, 3)];

    return key;
}

uint4 AES_Round_Two_Tables(const __local uint *AES0, const __local uint *AES1, const uint4 X, uint4 key)
{
    key.s0 ^= AES0[BYTE(X.s0, 0)] ^ AES1[BYTE(X.s1, 1)] ^ rotate(AES0[BYTE(X.s2, 2)] ^ AES1[BYTE(X.s3, 3)], 16U);
    key.s1 ^= AES0[BYTE(X.s1, 0)] ^ AES1[BYTE(X.s2, 1)] ^ rotate(AES0[BYTE(X.s3, 2)] ^ AES1[BYTE(X.s0, 3)], 16U);
    key.s2 ^= AES0[BYTE(X.s2, 0)] ^ AES1[BYTE(X.s3, 1)] ^ rotate(AES0[BYTE(X.s0, 2)] ^ AES1[BYTE(X.s1, 3)], 16U);
    key.s3 ^= AES0[BYTE(X.s3, 0)] ^ AES1[BYTE(X.s0, 1)] ^ rotate(AES0[BYTE(X.s1, 2)] ^ AES1[BYTE(X.s2, 3)], 16U);

    return key;
}


static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40 };


static const __constant uchar sbox[256] =
{
    0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
    0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
    0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
    0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
    0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
    0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
    0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
    0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
    0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
    0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
    0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
    0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
    0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
    0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
    0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
    0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16
};


#define SubWord(inw) ((sbox[BYTE(inw, 3)] << 24) | (sbox[BYTE(inw, 2)] << 16) | (sbox[BYTE(inw, 1)] << 8) | sbox[BYTE(inw, 0)])


void AESExpandKey256(uint *keybuf)
{
    //#pragma unroll 4
    for (uint c = 8, i = 1; c < 40; ++c) {
        // For 256-bit keys, an sbox permutation is done every other 4th uint generated, AND every 8th
        uint t = ((!(c & 7)) || ((c & 7) == 4)) ? SubWord(keybuf[c - 1]) : keybuf[c - 1];

        // If the uint we're generating has an index that is a multiple of 8, rotate and XOR with the round constant,
        // then XOR this with previously generated uint. If it's 4 after a multiple of 8, only the sbox permutation
        // is done, followed by the XOR. If neither are true, only the XOR with the previously generated uint is done.
        keybuf[c] = keybuf[c - 8] ^ ((!(c & 7)) ? rotate(t, 24U) ^ as_uint((uchar4)(rcon[i++], 0U, 0U, 0U)) : t);
    }
}

#endif
